{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Stacking with a Neural Network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from __future__ import division\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    " \n",
    "from sklearn.datasets import fetch_california_housing\n",
    " \n",
    "#From within an ipython notebook\n",
    "%matplotlib inline\n",
    " \n",
    "# Load the California housing dataset through scikit-learn's fetcher.\n",
    "cali_housing = fetch_california_housing()\n",
    " \n",
    "# Feature matrix and regression target as exposed by the dataset object.\n",
    "X = cali_housing.data\n",
    "y = cali_housing.target\n",
    "\n",
    "# Bucket the continuous target on the edges 0..5 so that the train/test\n",
    "# split can be stratified on the resulting bin index.\n",
    "bins = np.arange(6)\n",
    "binned_y = np.digitize(y, bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "# Hold out 20% of the rows as the final test set, stratified on the binned target.\n",
    "X_train_prin, X_test_prin, y_train_prin, y_test_prin = train_test_split(X, y,test_size=0.2,stratify=binned_y,random_state=7)\n",
    "\n",
    "# Re-bin the training targets so the second split can be stratified as well.\n",
    "binned_y_train_prin = np.digitize(y_train_prin, bins)\n",
    "\n",
    "# Split the training portion again: X_1/y_1 fit the base models, while the 33%\n",
    "# in X_stack/y_stack is kept aside to train the stacking model.\n",
    "X_1, X_stack, y_1, y_stack = train_test_split(X_train_prin,y_train_prin,test_size=0.33,stratify=binned_y_train_prin,random_state=7 )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=3, error_score='raise',\n",
       "       estimator=Pipeline(memory=None,\n",
       "     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('neural_net', MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,\n",
       "       beta_2=0.999, early_stopping=False, epsilon=1e-08,\n",
       "       hidden_layer_sizes=(100,), learning_rate='constant',\n",
       "       learnin...=True, solver='adam', tol=0.0001, validation_fraction=0.1,\n",
       "       verbose=False, warm_start=False))]),\n",
       "       fit_params=None, iid=True, n_jobs=-1,\n",
       "       param_grid={'neural_net__alpha': [0.02, 0.01, 0.005], 'neural_net__activation': ['relu'], 'neural_net__solver': ['adam'], 'neural_net__hidden_layer_sizes': [(50, 50, 50)]},\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
       "       scoring=None, verbose=0)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.neural_network import MLPRegressor\n",
    " \n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    " \n",
    "# Standardize the inputs, then fit the network. The network is seeded with\n",
    "# the same value used elsewhere in this notebook so that re-running the\n",
    "# search gives the same result.\n",
    "mlp_pipe = Pipeline(steps=[('scale', StandardScaler()),\n",
    "                           ('neural_net', MLPRegressor(random_state=7))])\n",
    " \n",
    " \n",
    "# Only alpha is varied; the other entries pin a single value each.\n",
    "param_grid = {'neural_net__alpha': [0.02,0.01,0.005],\n",
    "               'neural_net__hidden_layer_sizes' : [(50,50,50)],\n",
    "               'neural_net__activation': ['relu'],\n",
    "               'neural_net__solver' : ['adam']\n",
    "               }\n",
    " \n",
    "# 3-fold grid search over the pipeline, fitted on the base-model split.\n",
    "neural_net_gs = GridSearchCV(mlp_pipe, param_grid = param_grid,cv=3, n_jobs=-1)\n",
    "neural_net_gs.fit(X_1, y_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'neural_net__activation': 'relu',\n",
       " 'neural_net__alpha': 0.005,\n",
       " 'neural_net__hidden_layer_sizes': (50, 50, 50),\n",
       " 'neural_net__solver': 'adam'}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Parameter combination selected by the grid search.\n",
    "neural_net_gs.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.77494880692463708"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mean cross-validated score of the selected parameter combination.\n",
    "neural_net_gs.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "nn_best = neural_net_gs.best_estimator_\n",
    "import pickle\n",
    " \n",
    "# Persist the refitted pipeline. The context manager closes the file even\n",
    "# if pickling raises, which the manual open()/close() pair did not.\n",
    "with open('nn_best.save', 'wb') as f:\n",
    "    pickle.dump(nn_best, f, protocol = pickle.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=3, error_score='raise',\n",
       "          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n",
       "             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n",
       "             max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
       "             min_impurity_split=None, min_samples_leaf=1,\n",
       "             min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "             n_estimators=100, presort='auto', random_state=None,\n",
       "             subsample=1.0, verbose=0, warm_start=False),\n",
       "          fit_params=None, iid=True, n_iter=25, n_jobs=-1,\n",
       "          param_distributions={'n_estimators': [100], 'loss': ['huber'], 'warm_start': [True], 'random_state': [7], 'max_features': [0.4, 0.6, 0.8, 1.0], 'learning_rate': [0.1, 0.05, 0.03, 0.01], 'max_depth': [5, 7, 10], 'min_samples_leaf': [2, 3, 5]},\n",
       "          pre_dispatch='2*n_jobs', random_state=None, refit=True,\n",
       "          return_train_score=True, scoring=None, verbose=0)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    " \n",
    " \n",
    "# Candidate values for the randomized search; single-element lists pin a value.\n",
    "param_grid = {'learning_rate': [0.1,0.05,0.03,0.01],\n",
    "               'loss': ['huber'],\n",
    "               'max_depth': [5,7,10],\n",
    "               'max_features': [0.4,0.6,0.8,1.0],\n",
    "               'min_samples_leaf': [2,3,5],\n",
    "               'n_estimators': [100],\n",
    "               'warm_start': [True], 'random_state':[7]\n",
    "               }\n",
    " \n",
    "# The estimator is already seeded through the grid; seed the sampler too so\n",
    "# the 25 drawn candidates are the same on every run.\n",
    "boost_gs = RandomizedSearchCV(GradientBoostingRegressor(), param_distributions = param_grid,\n",
    "                              cv=3, n_jobs=-1, n_iter=25, random_state=7)\n",
    "boost_gs.fit(X_1, y_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.82477941207056227"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mean cross-validated score of the best sampled candidate.\n",
    "boost_gs.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'learning_rate': 0.1,\n",
       " 'loss': 'huber',\n",
       " 'max_depth': 10,\n",
       " 'max_features': 0.4,\n",
       " 'min_samples_leaf': 2,\n",
       " 'n_estimators': 100,\n",
       " 'random_state': 7,\n",
       " 'warm_start': True}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Parameter combination of the best sampled candidate.\n",
    "boost_gs.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Gradient-boosting base model, fitted on the base-model split. The settings\n",
    "# are written out by hand here (4000 trees, min_samples_leaf=5) rather than\n",
    "# read from boost_gs.best_params_.\n",
    "gbt_inst = GradientBoostingRegressor(\n",
    "    learning_rate=0.1,\n",
    "    loss='huber',\n",
    "    max_depth=10,\n",
    "    max_features=0.4,\n",
    "    min_samples_leaf=5,\n",
    "    n_estimators=4000,\n",
    "    warm_start=True,\n",
    "    random_state=7\n",
    ").fit(X_1, y_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def pickle_func(filename, saved_object):\n",
    "    import pickle\n",
    " \n",
    "    f = open(filename, 'wb')\n",
    "    pickle.dump(saved_object, f, protocol = pickle.HIGHEST_PROTOCOL)\n",
    "    f.close()\n",
    "     \n",
    "    return None\n",
    " \n",
    "# Persist the fitted gradient-boosting model.\n",
    "pickle_func('grad_boost.save', gbt_inst)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=3, error_score='raise',\n",
       "          estimator=BaggingRegressor(base_estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n",
       "             learning_rate=0.1, loss='huber', max_depth=10,\n",
       "             max_features=0.4, max_leaf_nodes=None,\n",
       "             min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "       ...n_estimators=10, n_jobs=1, oob_score=False,\n",
       "         random_state=None, verbose=0, warm_start=False),\n",
       "          fit_params=None, iid=True, n_iter=5, n_jobs=-1,\n",
       "          param_distributions={'max_features': [0.5, 1.0], 'max_samples': [0.5, 1.0], 'oob_score': [True, False], 'n_estimators': [20], 'base_estimator__min_samples_leaf': [4, 5]},\n",
       "          pre_dispatch='2*n_jobs', random_state=None, refit=True,\n",
       "          return_train_score=True, scoring=None, verbose=0)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import BaggingRegressor,GradientBoostingRegressor\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    " \n",
    " \n",
    "# Search space for the bagging wrapper; the 'base_estimator__' prefix\n",
    "# addresses a parameter of the wrapped gradient-boosting model.\n",
    "param_dist = {\n",
    "     'max_samples': [0.5,1.0],\n",
    "     'max_features' : [0.5,1.0],\n",
    "     'oob_score' : [True, False],\n",
    "     'base_estimator__min_samples_leaf': [4,5],\n",
    "     'n_estimators': [20]}\n",
    " \n",
    " \n",
    "\n",
    "# Small boosted model (20 trees) that each bag member is built from.\n",
    "single_estimator = GradientBoostingRegressor(**{'learning_rate': 0.1,\n",
    " 'loss': 'huber',\n",
    " 'max_depth': 10,\n",
    " 'max_features': 0.4,\n",
    " 'n_estimators': 20,\n",
    " 'warm_start': True, 'random_state':7})\n",
    " \n",
    "# Seed the bagging resampling so repeated runs draw the same bags.\n",
    "ensemble_estimator = BaggingRegressor(base_estimator = single_estimator, random_state=7)\n",
    " \n",
    " \n",
    "# Seed the sampler as well so the 5 drawn candidates are repeatable.\n",
    "pre_gs_inst_bag = RandomizedSearchCV(ensemble_estimator,\n",
    "  param_distributions = param_dist,\n",
    "  cv=3,\n",
    "  n_iter = 5,\n",
    "  n_jobs=-1,\n",
    "  random_state=7)\n",
    " \n",
    "pre_gs_inst_bag.fit(X_1, y_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.78178711839468873"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mean cross-validated score of the best sampled bagging configuration.\n",
    "pre_gs_inst_bag.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Persist the refitted best bagging model found by the search.\n",
    "pickle_func('bag_gbm.save', pre_gs_inst_bag.best_estimator_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def pickle_load_func(filename):\n",
    "    f = open(filename, 'rb')\n",
    "    to_return = pickle.load(f)\n",
    "    f.close()\n",
    "    \n",
    "    return to_return\n",
    "\n",
    "# Reload the three base models from the files written earlier in the notebook.\n",
    "neural_net = pickle_load_func('nn_best.save')\n",
    "gbt = pickle_load_func('grad_boost.save')\n",
    "bag_gbm = pickle_load_func('bag_gbm.save')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "           nn       gbt       bag\n",
      "nn   1.000000  0.950080  0.964221\n",
      "gbt  0.950080  1.000000  0.981978\n",
      "bag  0.964221  0.981978  1.000000\n"
     ]
    }
   ],
   "source": [
    "def handle_X_set(X_train_set_in):\n",
    "    \"\"\"Return the base models' predictions for ``X_train_set_in`` as a DataFrame.\n",
    "\n",
    "    The frame has one column per base model, in the order 'nn', 'gbt', 'bag'.\n",
    "    The models predict on a copy of the input.\n",
    "    \"\"\"\n",
    "    features = X_train_set_in.copy()\n",
    "\n",
    "    model_order = ['nn', 'gbt', 'bag']\n",
    "    predictions = {\n",
    "        'nn': neural_net.predict(features),\n",
    "        'gbt': gbt.predict(features),\n",
    "        'bag': bag_gbm.predict(features)\n",
    "    }\n",
    "\n",
    "    # Passing the column list fixes the column order regardless of dict ordering.\n",
    "    return pd.DataFrame(predictions, columns = model_order)\n",
    "\n",
    "def predict_from_X_set(X_train_set_in):\n",
    "    \"\"\"Predict with the stacked model for the raw feature rows ``X_train_set_in``.\n",
    "\n",
    "    The base-model predictions built by handle_X_set are fed to final_etr,\n",
    "    which must have been fitted before this function is called.\n",
    "    \"\"\"\n",
    "    # handle_X_set already works on its own copy, so no extra copy is made here.\n",
    "    # final_etr is fitted on a plain array (preds_df.values); give it the same\n",
    "    # kind of input at prediction time instead of a DataFrame.\n",
    "    return final_etr.predict(handle_X_set(X_train_set_in).values)\n",
    "\n",
    "# Base-model predictions on the stacking split, and the pairwise correlation\n",
    "# between the three prediction columns.\n",
    "preds_df = handle_X_set(X_stack)\n",
    "print (preds_df.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'max_features': 1.0,\n",
       " 'min_samples_leaf': 11,\n",
       " 'n_estimators': 100,\n",
       " 'oob_score': False}"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import ExtraTreesRegressor\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    " \n",
    "# Search space for the ExtraTrees model trained on the base-model predictions.\n",
    "param_dist = {'max_features' : ['sqrt','log2',1.0],\n",
    "  'min_samples_leaf' : [1, 2, 3, 7, 11],\n",
    "  'n_estimators': [50, 100],\n",
    "  'oob_score': [True, False]}\n",
    " \n",
    "# Both the estimator and the search sampler are seeded with 7.\n",
    "pre_gs_inst = RandomizedSearchCV(ExtraTreesRegressor(warm_start=True,bootstrap=True,random_state=7),\n",
    "  param_distributions = param_dist,\n",
    "  cv=3,\n",
    "  n_iter = 15,random_state=7)\n",
    " \n",
    "# Inputs are the base-model predictions; the target is the stacking split's y.\n",
    "pre_gs_inst.fit(preds_df.values, y_stack)\n",
    "\n",
    "pre_gs_inst.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Stacking model: ExtraTrees with 3000 trees, fitted on the base-model\n",
    "# predictions for the stacking split.\n",
    "# NOTE(review): bootstrap is left at its default here, while the search\n",
    "# above used bootstrap=True - confirm that this is intended.\n",
    "final_etr = ExtraTreesRegressor(\n",
    "    max_features=1.0,\n",
    "    min_samples_leaf=11,\n",
    "    n_estimators=3000,\n",
    "    oob_score=False,\n",
    "    random_state=7\n",
    ").fit(preds_df.values, y_stack)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.82358648513122856"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "# Mean 3-fold cross-validated score of the stacking model on the stacking split.\n",
    "cross_val_score(final_etr, preds_df.values, y_stack, cv=3).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "R-squared 0.840571369604\n",
      "MAE :  0.301872336955\n",
      "MAPE :  0.167229414954\n"
     ]
    }
   ],
   "source": [
    "# Score the stacked model on the held-out test set.\n",
    "y_pred = predict_from_X_set(X_test_prin)\n",
    " \n",
    "from sklearn.metrics import r2_score, mean_absolute_error\n",
    " \n",
    "# Single-argument print(...) is valid under both Python 2 and Python 3,\n",
    "# unlike the print statement; the emitted text is unchanged.\n",
    "print(\"R-squared {}\".format(r2_score(y_test_prin, y_pred)))\n",
    "print(\"MAE :  {}\".format(mean_absolute_error(y_test_prin, y_pred)))\n",
    "# Mean absolute error relative to the true value, reported as a fraction.\n",
    "print(\"MAPE :  {}\".format((np.abs(y_test_prin- y_pred)/y_test_prin).mean()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['MedInc',\n",
       " 'HouseAge',\n",
       " 'AveRooms',\n",
       " 'AveBedrms',\n",
       " 'Population',\n",
       " 'AveOccup',\n",
       " 'Latitude',\n",
       " 'Longitude']"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def handle_X_set_sp(X_train_set_in):\n",
    "    \"\"\"Return the input features together with the base models' predictions.\n",
    "\n",
    "    The frame's columns are cali_housing.feature_names followed by\n",
    "    'nn', 'gbt' and 'bag'. The models predict on a copy of the input.\n",
    "    \"\"\"\n",
    "    features = X_train_set_in.copy()\n",
    "\n",
    "    # Predict first, in a fixed order, then attach the columns in that order.\n",
    "    base_models = [('nn', neural_net), ('gbt', gbt), ('bag', bag_gbm)]\n",
    "    predictions = [(label, model.predict(features)) for label, model in base_models]\n",
    "\n",
    "    # Start from the raw input columns and append one column per base model.\n",
    "    stacked = pd.DataFrame(features, columns = cali_housing.feature_names)\n",
    "    for label, values in predictions:\n",
    "        stacked[label] = values\n",
    "\n",
    "    return stacked\n",
    "\n",
    "def predict_from_X_set_sp(X_train_set_in):\n",
    "    \"\"\"Predict with the features-plus-predictions stacked model.\n",
    "\n",
    "    The frame built by handle_X_set_sp is fed to final_etr_sp, which must\n",
    "    have been fitted before this function is called.\n",
    "    \"\"\"\n",
    "    # handle_X_set_sp already works on its own copy, so no extra copy is made here.\n",
    "    # final_etr_sp is fitted on a plain array (preds_df_sp.values); give it the\n",
    "    # same kind of input at prediction time instead of a DataFrame.\n",
    "    return final_etr_sp.predict(handle_X_set_sp(X_train_set_in).values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'max_features': 'log2',\n",
       " 'min_samples_leaf': 2,\n",
       " 'n_estimators': 100,\n",
       " 'oob_score': False}"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stacking inputs that carry the original features next to the predictions.\n",
    "preds_df_sp = handle_X_set_sp(X_stack)\n",
    "from sklearn.ensemble import ExtraTreesRegressor\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "\n",
    "# Search space for the ExtraTrees model trained on features plus predictions.\n",
    "param_dist = {'max_features' : ['sqrt','log2',1.0],\n",
    " 'min_samples_leaf' : [1, 2, 3, 7, 11],\n",
    " 'n_estimators': [50, 100],\n",
    " 'oob_score': [True, False]}\n",
    "\n",
    "# Both the estimator and the search sampler are seeded with 7.\n",
    "pre_gs_inst_2 = RandomizedSearchCV(ExtraTreesRegressor(warm_start=True,bootstrap=True,random_state=7),\n",
    " param_distributions = param_dist,\n",
    " cv=3,\n",
    " n_iter = 15,random_state=7)\n",
    "\n",
    "pre_gs_inst_2.fit(preds_df_sp.values, y_stack)\n",
    "pre_gs_inst_2.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Stacking model for the features-plus-predictions frame: ExtraTrees with\n",
    "# 3000 trees, fitted on the stacking split.\n",
    "final_etr_sp = ExtraTreesRegressor(\n",
    "    max_features='log2',\n",
    "    min_samples_leaf=2,\n",
    "    n_estimators=3000,\n",
    "    oob_score=False,\n",
    "    random_state=7\n",
    ").fit(preds_df_sp.values, y_stack)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.83158654580488089"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "# Mean 3-fold cross-validated score of the second stacking model on the stacking split.\n",
    "cross_val_score(final_etr_sp, preds_df_sp.values, y_stack, cv=3).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "R-squared 0.847505166521\n",
      "MAE :  0.293429491508\n",
      "MAPE :  0.161770823792\n"
     ]
    }
   ],
   "source": [
    "# Score the features-plus-predictions stacked model on the held-out test set.\n",
    "y_pred = predict_from_X_set_sp(X_test_prin)\n",
    "\n",
    "from sklearn.metrics import r2_score, mean_absolute_error\n",
    "\n",
    "# Single-argument print(...) is valid under both Python 2 and Python 3,\n",
    "# unlike the print statement; the emitted text is unchanged.\n",
    "print(\"R-squared {}\".format(r2_score(y_test_prin, y_pred)))\n",
    "print(\"MAE :  {}\".format(mean_absolute_error(y_test_prin, y_pred)))\n",
    "# Mean absolute error relative to the true value, reported as a fraction.\n",
    "print(\"MAPE :  {}\".format((np.abs(y_test_prin- y_pred)/y_test_prin).mean()))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
